##
## This file contains all the R code used for the data reading, data wrangling, and data analysis
## for the article by A. M. Petitbon and D. B. Hitchcock,
## "What Kind of Music Do You Like? A Statistical Analysis of Music Genre Popularity Over Time".
##
## The code below is separated into eight blocks, each with a specific and self-contained purpose.
##
## Block 1: Reading the Data from External Raw Data Files
## Block 2: Creating Vectors of Response Counts for the Genres
## Block 3: Creating Candidate Predictor Time Series and Doing Model Selection
## Block 4: Diagnostic Residual Plots to Check Model 
## Block 5: In-sample Plots of Observed Genre Proportions and Model-Predicted Genre Probabilities
## Block 6: Boxplots of Forecasts of 2019 Genre Proportions for Hot 100 and Pazz & Jop
## Block 7: Time Series Analysis of Cramer's V Association Measure
## Block 8: Cross-Correlation Function for Cramer's V and "Proportion Physical" Variable
##
## To go to each block of code, do: Find in Page: ## BLOCK 1
## Find in Page: ## BLOCK 2
## etc.
##

#######################################################################################
#######################################################################################
#######################################################################################
## BLOCK 1 OF CODE
# This first block of code reads the raw data in from the external .csv files
# which can be found within the supplementary material.
# The original data lists 16 genres, but we collapsed it to 8 genres.
# The eventual output of this block of code is a data frame contianing the genre counts for the years 1974 to 2018.
#######################################################################################
#######################################################################################
#######################################################################################

##Read in data

# Hot 100
##----------
# Read in the data.
Hot10074.16<-read.csv("Hot100_annual_with_genre.csv",header=TRUE)
Hot10017.18<-read.csv("Billboard_annual_2017_2020_with_genres.csv",header=TRUE)
Hot10074.18<-rbind(Hot10074.16[,c(4,6)],Hot10017.18[1:200,c(7,8)])
names(Hot10074.18)<-c("Year","Genre")

# Create the collapsed 8-genre list by combining the original 16 genre designations.
genres.wide<-with(Hot10074.18,table(Year,Genre))
rock.comb<-genres.wide[,"Rock"]+genres.wide[,"Rock & Roll"]+genres.wide[,"Metal"]+genres.wide[,"Punk"]
pop.comb<-genres.wide[,"Pop"]+genres.wide[,"Pop Standards"]
RB.comb<-genres.wide[,"R&B"]+genres.wide[,"Soul"]
other.comb<-genres.wide[,"Latin"]+genres.wide[,"Jazz"]+genres.wide[,"Blues"]+genres.wide[,"Ska/Reggae/Dancehall"]
genres.wide.new<-cbind(genres.wide[,2:4],pop.comb,genres.wide[,12],rock.comb,RB.comb,other.comb)

# Rename data frame column names to reflect the 8 new collapsed genres. Check for completion.
dimnames(genres.wide.new)[[2]]<-c("Country","Folk","Elec","Pop","Rap/Hip-Hop","Rock","RBSoul","Other")
head(genres.wide.new)

# Retrieve the 2019 genre counts to later create the 2019 Proportions (used later in the boxplots).
Hot10017.20<-read.csv("Billboard_annual_2017_2020_with_genresUPDATE.csv",header=TRUE)
Hot10017.20<-Hot10017.20[,c(7,8)]
names(Hot10017.20)<-c("Year","Genre")

Hot100.19<-Hot10017.20[(201:300),]
genres19<-table(Hot100.19)

# Pazz & Jop ----- DH needs to help here!
##----------
#Read in the Excel files for the separate "Album" and "Single" tabs.
library(readxl)

# Read in "Single" sheet genre designations for 1974-2016.
xl_data<-"Pazz&Jop_singles_data.xlsx"
table.genres.PJ_single<-read_excel(path=xl_data,sheet="Single")
genres.PJsingle<-table.genres.PJ_single[,c(3,4)]
head(genres.PJsingle)
#look at underline and what it means???
genres.PJsingle[1,2]


# Read in "Album" sheet genre designations for 1974-2016.
xl_data2<-"Pazz&Jop_Albums_data.xlsx"
tables.genres.PJ_album<-read_excel(path=xl_data2,sheet="Album")
genres.PJalbum<-tables.genres.PJ_album[,c(3,4)]
head(genres.PJalbum)

# Read in 2017-2020 data
x1_data3<-"PazzJop_2017-2020.xlsx"
# Genre counts were added to the 1974-2016 table from this Excel file

# Make counts tables from the "Single" and "Album" data frames, and transpose both counts tables.
genre_counts_PJsingle<-t(table(genres.PJsingle))
genre_counts_PJalbum<-t(table(genres.PJalbum))

# Adjust for the first column being different in "Single" and "Album" entries since singles' rankings started 5 years after albums.
genrePJ_temp<-cbind(rep(0,38),genre_counts_PJsingle[,1:6],rep(0,38),genre_counts_PJsingle[,7:11],rep(0,38),genre_counts_PJsingle[,12:13])

dimnames(genrePJ_temp)[[2]]<-dimnames(genre_counts_PJalbum)$Genre

# Create an empty matrix.
# Rows of zeros were added to the first 5 years for the matrix of "Singles" counts. 
# Later, we will add the "Singles" counts to the "Albums" counts, and this makes them the same dimension.
filler_rows<-matrix(0,nr=5,nc=16)
genre_counts_PJsingle_full<-rbind(filler_rows,genrePJ_temp)
head(genre_counts_PJsingle_full)

dimnames(genre_counts_PJsingle_full)[[1]]<-dimnames(genre_counts_PJalbum)$Year

genre_counts_PJtotal<-genre_counts_PJsingle_full+genre_counts_PJalbum
head(genre_counts_PJtotal)

rowSums(genre_counts_PJtotal)

#Combining/condensing genres
rock.combPJ<-genre_counts_PJtotal[,"Rock"]+genre_counts_PJtotal[,"Rock & Roll"]
pop.combPJ<-genre_counts_PJtotal[,"Pop"]+genre_counts_PJtotal[,"Pop Standards"]
RB.combPJ<-genre_counts_PJtotal[,"R&B"]+genre_counts_PJtotal[,"Soul"]
other.combPJ<-genre_counts_PJtotal[,"Latin"]+genre_counts_PJtotal[,"Jazz"]+genre_counts_PJtotal[,"Blues"]+genre_counts_PJtotal[,"Ska/Reggae/Dancehall"]

#Redefining dataframe with condensed genres
genres_PJtotal.new<-cbind(genre_counts_PJtotal[,2:4],genre_counts_PJtotal[,7],pop.combPJ,genre_counts_PJtotal[,10],genre_counts_PJtotal[,12],rock.combPJ,RB.combPJ,other.combPJ)

#Redefining dataframe with condensed genres
dimnames(genres_PJtotal.new)[[2]]<-c("Country","Folk","Elec","Metal","Pop","Punk","Rap/Hip-Hop","Rock","RBSoul","Other")


#######################################################################################
#######################################################################################
#######################################################################################
## BLOCK 2 OF CODE

# This block of code takes the data frame of genre counts over the years and 
# creates response variables consisting of the series of counts over the years, for each genre.
# Because our model uses lagged dependent variables, we create series for the full set of years (1974-2018),
# series for the years 1975-2018, and lagged series for the years 1974-2017.
# These series are created for both the Hot 100 counts and for the Pazz & Jop counts.
#######################################################################################
#######################################################################################
#######################################################################################

##_______________________________________________________________________________________________________________
## Responses and Lag-1 Responses

# Hot 100
##----------
# Read in the Hot 100 genre counts.
genres.wide.new <- read.table("Hot100counts.txt",header=T,skip=1)

# Create a variable for the 1974-2018 genre counts belonging to each of the 8 genres; i.e. Y1, Y2, ... , Y10
# Create a variable for the shortened 1975-2018 genre counts belonging to each of the 8 genres; i.e. Y1.s, Y2.s, ... , Y10.s
# Create a variable for the lag-1 1974-2017 genre counts belonging to each of the 8 genres; i.e. Y1.L1, Y2.L1, ... , Y10.L1

Y1<-genres.wide.new[,"Country"]
Y1.s<-Y1[-1]
Y1.L1<-Y1[1:44]

Y2<-genres.wide.new[,"Folk"]
Y2.s<-Y2[-1]
Y2.L1<-Y2[1:44]

Y3<-genres.wide.new[,"Elec"]
Y3.s<-Y3[-1]
Y3.L1<-Y3[1:44]

Y5<-genres.wide.new[,"Pop"]
Y5.s<-Y5[-1]
Y5.L1<-Y5[1:44]

Y7<-genres.wide.new[,"Rap/Hip-Hop"]
Y7.s<-Y7[-1]
Y7.L1<-Y7[1:44]

# NOTE: The numbering of the Y responses is not arbitrary.
# Original analysis included 10 genres. However, the final analysis includes only 8 genres.
# Genres Y4, Y6, and Y8 were collapsed into the "Rock" genre in the genre counts table. 
# Hence, the Y486 prefix is for "Rock" throughout the Hot 100 related code.
Y486<-genres.wide.new[,"Rock"]
Y486.s<-Y486[-1]
Y486.L1<-Y486[1:44]

Y9<-genres.wide.new[,"RBSoul"]
Y9.s<-Y9[-1]
Y9.L1<-Y9[1:44]

Y10<-genres.wide.new[,"Other"]
Y10.s<-Y10[-1]
Y10.L1<-Y10[1:44]

# Pazz & Jop
##----------
# Read in the Pazz & Jop genre counts.
genres_PJtotal.new <- read.table("PazzJopcounts",header=T,skip=1)

# Create a variable for the 1974-2018 genre counts belonging to each of the 8 genres; i.e. y1, y2, ... , y10
# Create a variable for the shorted 1975-2018 genre counts belonging to each of the 8 genres; i.e. y1.s, y2.s, ... , y10.s
# Create a variable for the lag-1 1974-2017 genre counts belonging to each of the 8 genres; i.e. y1.L1, y2.L1, ... , y10.L1

y1<-genres_PJtotal.new[,"Country"]
y1.s<-y1[-1]
y1.L1<-y1[1:44]

y2<-genres_PJtotal.new[,"Folk"]
y2.s<-y2[-1]
y2.L1<-y2[1:44]

y3<-genres_PJtotal.new[,"Elec"]
y3.s<-y3[-1]
y3.L1<-y3[1:44]

y5<-genres_PJtotal.new[,"Pop"]
y5.s<-y5[-1]
y5.L1<-y5[1:44]

y7<-genres_PJtotal.new[,"Rap.Hip.Hop"]
y7.s<-y7[-1]
y7.L1<-y7[1:44]

# NOTE: The numbering of the y responses is not arbitrary.
# Original analysis included 10 genres. However, the final analysis includes only 8 genres.
# Genres y4, y6, and y8 were collapsed into the "Rock" genre in the genre counts table. 
# Hence, the y468 prefix is for "Rock" throughout the Pazz & Jop related code.
y468<-genres_PJtotal.new[,"Rock"]
y468.s<-y468[-1]
y468.L1<-y468[1:44]

y9<-genres_PJtotal.new[,"RBSoul"]
y9.s<-y9[-1]
y9.L1<-y9[1:44]

y10<-genres_PJtotal.new[,"Other"]
y10.s<-y10[-1]
y10.L1<-y10[1:44]


#######################################################################################
#######################################################################################
#######################################################################################
## BLOCK 3 OF CODE
# This block of code creates vectors that are the candidate predictor variables in our models.
# For most candidate predictors, we consider both unlagged versions (1975-2018) and
# lagged versions (1974-2017) for inclusion in the model.
# This set of potential predictors includes vectors representing (centered) time, time^2, and time^3.
# The block of code also fits the VGLM (multinomial logit) model with all possible subsets of predictors,
# not including time itself.  The AIC and BIC are calculated for each model and 
# are used to help guide the choice of the best model.
#######################################################################################
#######################################################################################
#######################################################################################

##_______________________________________________________________________________________________________________
## Fit all subsets VGLM Models

# Hot 100
##----------
library(VGAM)

# Create time variables.
# Time is measured from years 1975 through 2018 in terms of the number of years since/centered around 1996 (midpoint). 
time <- seq(-21,22,by=1)
timesq <- time^2
timecube<-time^3
 
# Read-in/Create potential predictors and lag-1 predictors.
# Code to create objects that are annual predictor variables located in Supplementary Material.
# See code_to_get_annual_predictors_time_series.txt

# Inflation
infl<-as.vector(inflation_yr_1974_2018)
infl.s<-infl[-1]
infl.L1 <- infl[1:44]

# Unemployment
unemp <- as.vector(unemp_annual)
unemp.s<-unemp[-1]
unemp.L1 <- unemp[1:44]

# GDP
gdp<-gdp_growth_1974_2018
gdp.s<-gdp[-1]
gdp.L1<-gdp[1:44]

# Political factor
polit.s <- polit_pred[-1]
polit.L1 <- polit_pred[1:44]

## For the music-industry predictor variables, 
## we do not include lagged versions.
## We don't believe the time lag will exist like it might 
## for the economic and demographic and political predictors.

# Proportion Physical
phys_1975_2018<-scan("phys.txt")
phys.s <- phys_1975_2018[phys_1975_2018<2]  
# no reason to use a lagged predictor for this one...
# we do not believe the effect of proportion of physical sales will include a time lag,
# so we did not consider the lagged version in the model.

# Sales Adjusted for Inflation 
sales_adj_1975_2018 <- scan("Sales_adj_v1.txt")
sales_adj.s <- sales_adj_1975_2018[sales_adj_1975_2018 > 2500] 
# we do not believe the effect of adjusted sales will include a time lag,
# so we did not consider the lagged version in the model.


# Define the candidate predictors:
preds <-c('unemp.s','unemp.L1','polit.s', 'polit.L1', 'infl.s', 'infl.L1','gdp.s', 'gdp.L1','phys.s','sales_adj.s','sales_adj.L1')

# Set up null vectors that will be filled in with the combinations of predictors and the corresponding BIC values:
temp.preds.all <- NULL
tempBIC.all <-NULL
tempAIC.all <-NULL

# Loop over all possible numbers of variables in the model:
for (i in 1:length(preds)){

# For a fixed number of variables, get all the combinations of predictors for that model size:
temp.list <- combn(preds,m=i)
# The columns of temp.list are the various combinations of predictors.

# Loop across the columns of temp.list, taking each combination of predictors one at a time:
for (j in 1:ncol(temp.list)){

# Pasting the predictor names into a single string, with the names separated by the + symbol:
temp.preds<-paste(temp.list[,j], sep = '', collapse = '+')

# Pasting the response part of the formula and the predictors part of the formula
# into one character string that will define the model formula:
temp.formula<-paste0('cbind(Y1.s,Y2.s,Y3.s,Y5.s,Y7.s,Y9.s,Y10.s,Y486.s)~',paste(temp.list[,j], sep = '', collapse = '+'))

# temp.preds.all will collect all the predictor variable combinations tried:
temp.preds.all <- c(temp.preds.all,temp.preds)

# Fitting the multinomial logit model with the currently defined model formula:
fit <- vglm(formula = as.formula(temp.formula), family=multinomial)

# Getting the BIC for that model fit:
tempBIC <- BIC(fit)
tempAIC <- AIC(fit)

# Storing all the BIC values for every model tried:
tempBIC.all <- c(tempBIC.all,tempBIC)
tempAIC.all <- c(tempAIC.all,tempAIC)
}
}

# Putting the variable combinations and the corresponding BIC values into one data frame:
myresults<-data.frame(temp.preds.all,tempBIC.all,tempAIC.all)

# Ordering the rows from smallest BIC to largest:
myresults.ord<- myresults[order(tempBIC.all),]

# Printing out the top 20 models along with their BIC values:
print(myresults.ord[1:20,])


# Top 5 BICs:

top.BICs<-myresults.ord[1:5,2]

delta.j<-top.BICs-min(top.BICs)
post.probs<-exp(delta.j/(-2))/sum(exp(delta.j/(-2)) )
#print(post.probs)
#approx. posterior prob that the model is THAT (ordinal) model based on BIC

# Pazz & Jop
##----------
library(VGAM)

# Create time variables.
# Time is measured from years 1975 through 2018 in terms of the number of years since/centered around 1996. 
time <- seq(-21,22,by=1)
timesq <- time^2
timecube<-time^3

# Read-in/Create potential predictors and lag-1 predictors.
# Code to create objects that are annual predictor variables located in Supplementary Material.
# See code_to_get_annual_predictors_time_series.txt

# Inflation
infl<-as.vector(inflation_yr_1974_2018)
infl.s<-infl[-1]
infl.L1 <- infl[1:44]

# Unemployment
unemp <- as.vector(unemp_annual)
unemp.s<-unemp[-1]
unemp.L1 <- unemp[1:44]

# GDP
gdp<-gdp_growth_1974_2018
gdp.s<-gdp[-1]
gdp.L1<-gdp[1:44]

# Political factor
polit.s <- polit_pred[-1]
polit.L1 <- polit_pred[1:44]

## For the music-industry predictor variables, 
## we do not include lagged versions.
## We don't believe the time lag will exist like it might 
## for the economic and demographic and political predictors.


# Proportion Physical
phys_1975_2018<-scan("phys.txt")
phys.s <- phys_1975_2018[phys_1975_2018<2]  
# no reason to use a lagged predictor for this one...
# we do not believe the effect of proportion of physical sales will include a time lag,
# so we did not consider the lagged version in the model.

# Sales Adjusted for Inflation

sales_adj_1975_2018 <- scan("Sales_adj_v1.txt")
sales_adj.s <- sales_adj_1975_2018[sales_adj_1975_2018 > 2500] 
# we do not believe the effect of adjusted sales will include a time lag,
# so we did not consider the lagged version in the model.

# Define the candidate predictors:
preds <-c('unemp.s','unemp.L1','polit.s', 'polit.L1', 'infl.s', 'infl.L1','gdp.s', 'gdp.L1','phys.s','sales_adj.s','sales_adj.L1')

# Set up null vectors that will be filled in with the combinations of predictors and the corresponding BIC values:
temp.preds.all <- NULL
tempBIC.all <-NULL
tempAIC.all <-NULL

# Loop over all possible numbers of variables in the model:
for (i in 1:length(preds)){

# For a fixed number of variables, get all the combinations of predictors for that model size:
temp.list <- combn(preds,m=i)
# The columns of temp.list are the various combinations of predictors.

# Loop across the columns of temp.list, taking each combination of predictors one at a time:
for (j in 1:ncol(temp.list)){

# Pasting the predictor names into a single string, with the names separated by the + symbol:
temp.preds<-paste(temp.list[,j], sep = '', collapse = '+')

# Pasting the response part of the formula and the predictors part of the formula
# into one character string that will define the model formula:
temp.formula<-paste0('cbind(y1.s,y2.s,y3.s,y5.s,y7.s,y9.s,y10.s,y468.s)~',paste(temp.list[,j], sep = '', collapse = '+'))


# temp.preds.all will collect all the predictor variable combinations tried:
temp.preds.all <- c(temp.preds.all,temp.preds)

# Fitting the multinomial logit model with the currently defined model formula:
fit <- vglm(formula = as.formula(temp.formula), family=multinomial)

# Getting the BIC for that model fit:
tempBIC <- BIC(fit)
tempAIC <- AIC(fit)

# Storing all the BIC values for every model tried:
tempBIC.all <- c(tempBIC.all,tempBIC)
tempAIC.all <- c(tempAIC.all,tempAIC)
}
}

# Putting the variable combinations and the corresponding BIC values into one data frame:
myresults<-data.frame(temp.preds.all,tempBIC.all,tempAIC.all)

# Ordering the rows from smallest BIC to largest:
myresults.ord<- myresults[order(tempBIC.all),]

# Printing out the top 20 models along with their BIC values:
print(myresults.ord[1:20,])


# Top 5 BICs:

top.BICs<-myresults.ord[1:5,2]

delta.j<-top.BICs-min(top.BICs)
post.probs<-exp(delta.j/(-2))/sum(exp(delta.j/(-2)) )
#print(post.probs)
#approx. posterior prob that the model is THAT (ordinal) model based on BIC


#######################################################################################
#######################################################################################
#######################################################################################
## BLOCK 4 OF CODE
# This block of code produces various diagnostic residual plots to check to adequacy of the models,
# both for the Hot 100 model and the Pazz & Jop model.
# Some of the plots produced include:  time series plots of the raw residuals (not as useful),
# time series plots of the Pearson residuals, autocorrelation function (ACF) plots of the Pearson residuals,
# and scatterplots of the Pearson residuals against the lag-1 versions of such residuals.
#######################################################################################
#######################################################################################
#######################################################################################


##_______________________________________________________________________________________________________________
##ACF TS Plots of Residuals 

# Hot 100
#----------
# Using ggplot2 (June 2021):
library(ggplot2)


# Define time.
Year <- 1975:2018

# Plotting residuals over time.
# Pearson residuals for "best" model. Change the name of the "best" model here as needed; i.e. fitbestLDVsH1003
dev.new()
resp.residsH100<-resid(fitbestLDVsH1003, type = "pearson") 

# Despite using the dimension names of the Pazz & Jop data here, they match the Hot 100 data because its the same genres.
resp.resids.long <- data.frame(I(rep(Year,times=8)),I(as.vector(resp.residsH100)),I(rep(dimnames(genres_PJtotal.new)[[2]][c(1,2,3,5,7,9,10,8)],each=44)) )
names(resp.resids.long) <- c("Year","resp.resids.vec","genre.type")

P1 <- ggplot(data=resp.resids.long, aes(x=Year)) +
  geom_line(aes(y=resp.resids.vec)) +
  geom_point(aes(y=resp.resids.vec)) + geom_hline(aes(yintercept=0))

P2 <- P1 + facet_wrap(vars(resp.resids.long$genre.type),nrow=4,ncol=2, scales="free_y")
P2

P2 + theme(legend.title = element_blank()) +
theme(legend.position="top") + ggtitle("Hot 100: Time series plot of residuals by genre") + ylab("Residuals")

dev.new()
library(forecast);library(tidyr)
# ACF plots

P3 <- ggAcf(resp.resids.long$resp.resids.vec[1:44], lag.max=16)+ theme(legend.title = element_blank()) + theme(legend.position="top") + ggtitle("Country")
P3
 
P4 <- ggAcf(resp.resids.long$resp.resids.vec[45:88], lag.max=16)+ theme(legend.title = element_blank()) + theme(legend.position="top") + ggtitle("Folk")
P4
 
P5 <- ggAcf(resp.resids.long$resp.resids.vec[89:132], lag.max=16)+ theme(legend.title = element_blank()) + theme(legend.position="top") + ggtitle("Elec")
P5

P6 <- ggAcf(resp.resids.long$resp.resids.vec[133:176], lag.max=16)+ theme(legend.title = element_blank()) +
    theme(legend.position="top") + ggtitle("Pop")
P6

P8 <- ggAcf(resp.resids.long$resp.resids.vec[177:220], lag.max=16)+ theme(legend.title = element_blank()) +
   theme(legend.position="top") + ggtitle("Rap/Hip-Hop")

P9 <- ggAcf(resp.resids.long$resp.resids.vec[221:264], lag.max=16)+ theme(legend.title = element_blank()) +
    theme(legend.position="top") + ggtitle("RBSoul")
 
P10 <- ggAcf(resp.resids.long$resp.resids.vec[265:308], lag.max=16)+ theme(legend.title = element_blank()) +
    theme(legend.position="top") + ggtitle("Other")


P11 <- ggAcf(resp.resids.long$resp.resids.vec[309:352], lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Rock")

multiplot(P3,P4,P6,P9,P5,P10,P8,P11, cols=2)  #trial and error for correct order and number of columns...

# Pazz & Jop
#----------
# Using ggplot2 (June 2021):
library(ggplot2)

# Define time.
Year <- 1975:2018

# Plotting residuals over time.
# Response residuals for "best" model. Change the name of the "best" model here as needed; i.e. fitbestLDVsB2
dev.new()
resp.residsPJ<-resid(fitbestLDVsB2, type = "response")

resp.resids.long <- data.frame(I(rep(Year,times=8)),I(as.vector(resp.residsPJ)),I(rep(dimnames(genres_PJtotal.new)[[2]][c(1,2,3,5,7,9,10,8)],each=44)) )
names(resp.resids.long) <- c("Year","resp.resids.vec","genre.type")

P1 <- ggplot(data=resp.resids.long, aes(x=Year)) +
  geom_line(aes(y=resp.resids.vec)) +
  geom_point(aes(y=resp.resids.vec)) + geom_hline(aes(yintercept=0))

P2 <- P1 + facet_wrap(vars(resp.resids.long$genre.type),nrow=4,ncol=2, scales="free_y")
P2

P2 + theme(legend.title = element_blank()) +
theme(legend.position="top") + ggtitle("Pazz and Jop: Time series plot of residuals by genre") + ylab("Residuals")

dev.new()
library(forecast);library(tidyr)
# ACF plots

P3 <- ggAcf(resp.resids.long$resp.resids.vec[1:44], lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Country")
# P3
 
 P4 <- ggAcf(resp.resids.long$resp.resids.vec[45:88], lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Folk")
# P4
 
 P5 <- ggAcf(resp.resids.long$resp.resids.vec[89:132], lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Elec")
# P5
 
 P6 <- ggAcf(resp.resids.long$resp.resids.vec[133:176], lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Pop")
# P6
 
## Had to rearrange the ggtitle labels for the last three plots

 P8 <- ggAcf(resp.resids.long$resp.resids.vec[177:220], lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Rap/Hip-Hop")
 
 P9 <- ggAcf(resp.resids.long$resp.resids.vec[221:264], lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("RBSoul")
 
P10 <- ggAcf(resp.resids.long$resp.resids.vec[265:308], lag.max=16)+ theme(legend.title = element_blank()) +
    theme(legend.position="top") + ggtitle("Other")


P11 <- ggAcf(resp.resids.long$resp.resids.vec[309:352], lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Rock")

# Had to change the order of the plots below a bit
multiplot(P3,P4,P6,P9,P5,P10,P8,P11, cols=2)  #trial and error for correct order and number of columns...

##______________________________________________________________________________________________________________
##Pearson Residuals/Zlag Time Series Plots

# Hot 100
##----------
# Using ggplot2 (June 2021):
library(ggplot2)


# Define time.
Year <- 1975:2018

dev.new()

# Change the name of the "best" model here as needed; i.e. fitbestLDVsH100
resp.residsH100<-resid(fitbestLDVsH100, type = "response") 
resp.residsH100pearson<-resid(fitbestLDVsH100, type = "pearson") # Using Pearson resids here (more justified for GLMs)

residsPearson <- data.frame(resp.residsH100pearson)
names(residsPearson) <- c('rpCountry','rpFolk','rpElec','rpPop','rpRapHH','rpRBSoul','rpOther')

# Plot of Pearson residuals vs fitted values (checks overall model fit)
#plot(mypredmat[,-8],resp.residsH100pearson, xlab="Model Fitted Values", ylab="Pearson Residuals")

for.overall.fit <- data.frame(cbind(as.vector(mypredmat[,-8]),as.vector(resp.residsH100pearson)))
names(for.overall.fit)<-c('fittedvals','pearsonresids')

P0p <- ggplot(for.overall.fit,aes(x=fittedvals, y=pearsonresids)) + geom_point() + ggtitle("Pearson Residuals vs. Fitted Values") +xlab("Fitted Values") + ylab("Pearson Residuals")

# This plots the response residuals over time.
resp.resids.long <- data.frame(I(rep(Year,times=8)),I(as.vector(resp.residsH100)),I(rep(dimnames(genres_PJtotal.new)[[2]][c(1,2,3,5,7,9,10,8)],each=44)) )
names(resp.resids.long) <- c("Year","resp.resids.vec","genre.type")

P1 <- ggplot(data=resp.resids.long, aes(x=Year)) +
  geom_line(aes(y=resp.resids.vec)) +
  geom_point(aes(y=resp.resids.vec)) + geom_hline(aes(yintercept=0))

P2 <- P1 + facet_wrap(vars(resp.resids.long$genre.type),nrow=4,ncol=2, scales="free_y")
#P2

P2 + theme(legend.title = element_blank()) +
theme(legend.position="top") + ggtitle("Hot 100: Time series plot of response residuals by genre") + ylab("Response Residuals")

dev.new()
# This plots the Pearson residuals over time.
# Note the "times" agument in the rep function below is 7, even though there are 8 genres, 
# because the Pearson residuals are only defined for the non-baseline categories (not the baseline category of Rock)
resp.resids.longH <- data.frame(I(rep(Year,times=7)),I(as.vector(resp.residsH100pearson)),I(rep(dimnames(genres_PJtotal.new)[[2]][c(1,2,3,5,7,9,10)],each=44)) )
names(resp.resids.longH) <- c("Year","resp.resids.vecH","genre.type")

P1pts <- ggplot(data=resp.resids.longH, aes(x=Year)) +
  geom_line(aes(y=resp.resids.vecH)) +
  geom_point(aes(y=resp.resids.vecH)) + geom_hline(aes(yintercept=0))

P2pts <- P1pts + facet_wrap(vars(resp.resids.longH$genre.type),nrow=4,ncol=2, scales="free_y")
#P2

P2pts + theme(legend.title = element_blank()) +
theme(legend.position="top") + ggtitle("Hot 100: Time series plot of Pearson residuals by genre") + ylab("Pearson Residuals")


dev.new()
library(forecast);library(tidyr);library(TSA)
# Scatter plots of Pearson resids vs lag-1 Pearson resids:

P1p <- ggplot(residsPearson,aes(x=zlag(rpCountry), y=rpCountry)) + geom_point() + ggtitle("Country") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P2p <- ggplot(residsPearson,aes(x=zlag(rpFolk), y=rpFolk)) + geom_point() + ggtitle("Folk") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P3p <- ggplot(residsPearson,aes(x=zlag(rpElec), y=rpElec)) + geom_point() + ggtitle("Electronic") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P4p <- ggplot(residsPearson,aes(x=zlag(rpPop), y=rpPop)) + geom_point() + ggtitle("Pop") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P5p <- ggplot(residsPearson,aes(x=zlag(rpRapHH), y=rpRapHH)) + geom_point() + ggtitle("Rap/Hip-Hop") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P6p <- ggplot(residsPearson,aes(x=zlag(rpRBSoul), y=rpRBSoul)) + geom_point() + ggtitle("RBSoul") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P7p <- ggplot(residsPearson,aes(x=zlag(rpOther), y=rpOther)) + geom_point() + ggtitle("Other") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


# Had to change the order of the plots below a bit
multiplot(P1p,P2p,P4p,P6p,P3p,P7p,P5p,P0p, cols=2)  #trial and error for correct order and number of columns...


dev.new()

### ACFs for Pearson Residuals 

library(forecast);library(tidyr)
# ACF plots

P1acf <- ggAcf(residsPearson$rpCountry, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Country")

 
 P2acf <- ggAcf(residsPearson$rpFolk, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Folk")

 
 P3acf <- ggAcf(residsPearson$rpElec, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Elec")

 
 P4acf <- ggAcf(residsPearson$rpPop, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Pop")

 
 P5acf <- ggAcf(residsPearson$rpRapHH, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Rap/Hip-Hop")
 
 P6acf <- ggAcf(residsPearson$rpRBSoul, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("RBSoul")
 
P7acf <- ggAcf(residsPearson$rpOther, lag.max=16)+ theme(legend.title = element_blank()) +
    theme(legend.position="top") + ggtitle("Other")


# Had to change the order of the plots below a bit
multiplot(P1acf,P2acf,P4acf,P6acf,P3acf,P7acf,P5acf, cols=2)   #trial and error for correct order and number of columns...

## Ignore this if using R Studio...
# pdf("C:/Users/hitchcod/Desktop/scatter_Pearson_resids_PJ_B2_final.pdf")
## Plotting code...
# dev.off()

# Pazz & Jop
#-----------
# Using ggplot2 (June 2021):
library(ggplot2)


# Define time.
Year <- 1975:2018

dev.new()

# Change the name of the "best" model here as needed; i.e. fitbestLDVsB2
resp.residsPJ<-resid(fitbestLDVsB2, type = "response")
resp.residsPJpearson<-resid(fitbestLDVsB2, type = "pearson") # Using Pearson resids here (more justified for GLMs)

residsPearson <- data.frame(resp.residsPJpearson)
names(residsPearson) <- c('rpCountry','rpFolk','rpElec','rpPop','rpRapHH','rpRBSoul','rpOther')

# Plot of Pearson residuals vs fitted values (checks overall model fit)
#plot(mypredmat[,-8],resp.residsPJpearson, xlab="Model Fitted Values", ylab="Pearson Residuals")

for.overall.fit <- data.frame(cbind(as.vector(mypredmat[,-8]),as.vector(resp.residsPJpearson)))
names(for.overall.fit)<-c('fittedvals','pearsonresids')

P0p <- ggplot(for.overall.fit,aes(x=fittedvals, y=pearsonresids)) + geom_point() + ggtitle("Pearson Residuals vs. Fitted Values") +xlab("Fitted Values") + ylab("Pearson Residuals")

# This plots the response residuals over time. 
resp.resids.long <- data.frame(I(rep(Year,times=8)),I(as.vector(resp.residsPJ)),I(rep(dimnames(genres_PJtotal.new)[[2]][c(1,2,3,5,7,9,10,8)],each=44)) )
names(resp.resids.long) <- c("Year","resp.resids.vec","genre.type")

P1 <- ggplot(data=resp.resids.long, aes(x=Year)) +
  geom_line(aes(y=resp.resids.vec)) +
  geom_point(aes(y=resp.resids.vec)) + geom_hline(aes(yintercept=0))

P2 <- P1 + facet_wrap(vars(resp.resids.long$genre.type),nrow=4,ncol=2, scales="free_y")
#P2

P2 + theme(legend.title = element_blank()) +
theme(legend.position="top") + ggtitle("Pazz and Jop: Time series plot of response residuals by genre") + ylab("Response Residuals")

dev.new()
# This plots the Pearson residuals over time.
# Note the "times" agument in the rep function below is 7, even though there are 8 genres, 
# because the Pearson residuals are only defined for the non-baseline categories (not the baseline category of Rock)

resp.resids.longP <- data.frame(I(rep(Year,times=7)),I(as.vector(resp.residsPJpearson)),I(rep(dimnames(genres_PJtotal.new)[[2]][c(1,2,3,5,7,9,10)],each=44)) )
names(resp.resids.longP) <- c("Year","resp.resids.vecP","genre.type")

P1pts <- ggplot(data=resp.resids.longP, aes(x=Year)) +
  geom_line(aes(y=resp.resids.vecP)) +
  geom_point(aes(y=resp.resids.vecP)) + geom_hline(aes(yintercept=0))

P2pts <- P1pts + facet_wrap(vars(resp.resids.longP$genre.type),nrow=4,ncol=2, scales="free_y")
#P2

P2pts + theme(legend.title = element_blank()) +
theme(legend.position="top") + ggtitle("Pazz and Jop: Time series plot of Pearson residuals by genre") + ylab("Pearson Residuals")



dev.new()
library(forecast);library(tidyr);library(TSA)
# Scatter plots of Pearson resids vs lag-1 Pearson resids:

P1p <- ggplot(residsPearson,aes(x=zlag(rpCountry), y=rpCountry)) + geom_point() + ggtitle("Country") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P2p <- ggplot(residsPearson,aes(x=zlag(rpFolk), y=rpFolk)) + geom_point() + ggtitle("Folk") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P3p <- ggplot(residsPearson,aes(x=zlag(rpElec), y=rpElec)) + geom_point() + ggtitle("Electronic") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P4p <- ggplot(residsPearson,aes(x=zlag(rpPop), y=rpPop)) + geom_point() + ggtitle("Pop") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P5p <- ggplot(residsPearson,aes(x=zlag(rpRapHH), y=rpRapHH)) + geom_point() + ggtitle("Rap/Hip-Hop") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P6p <- ggplot(residsPearson,aes(x=zlag(rpRBSoul), y=rpRBSoul)) + geom_point() + ggtitle("RBSoul") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


P7p <- ggplot(residsPearson,aes(x=zlag(rpOther), y=rpOther)) + geom_point() + ggtitle("Other") +xlab("Lag-1 Pearson Residuals") + ylab("Pearson Residuals")


# Had to change the order of the plots below a bit
multiplot(P1p,P2p,P4p,P6p,P3p,P7p,P5p,P0p, cols=2)  #trial and error for correct order and number of columns...


dev.new()

### ACFs for Pearson Residuals 

library(forecast);library(tidyr)
# ACF plots

P1acf <- ggAcf(residsPearson$rpCountry, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Country")

 
 P2acf <- ggAcf(residsPearson$rpFolk, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Folk")

 
 P3acf <- ggAcf(residsPearson$rpElec, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Elec")

 
 P4acf <- ggAcf(residsPearson$rpPop, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Pop")

 
 P5acf <- ggAcf(residsPearson$rpRapHH, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("Rap/Hip-Hop")
 
 P6acf <- ggAcf(residsPearson$rpRBSoul, lag.max=16)+ theme(legend.title = element_blank()) +
     theme(legend.position="top") + ggtitle("RBSoul")
 
P7acf <- ggAcf(residsPearson$rpOther, lag.max=16)+ theme(legend.title = element_blank()) +
    theme(legend.position="top") + ggtitle("Other")


# Had to change the order of the plots below a bit
multiplot(P1acf,P2acf,P4acf,P6acf,P3acf,P7acf,P5acf, cols=2) #trial and error for correct order and number of columns...

## Ignore this if using R Studio...
# pdf("C:/Users/hitchcod/Desktop/scatter_Pearson_resids_PJ_B2_final.pdf")
## Plotting code...
# dev.off()

#######################################################################################
#######################################################################################
#######################################################################################
## BLOCK 5 OF CODE
# This block of code makes separate time series plots of observed genre proportions from 1975-2018 for each of the 8 genres,
# and time series plots of the model-predicted genre probabilities.  
# The purpose is to check the in-sample fit of the models.
# In-sample plots are given for both the Hot 100 and Pazz & Jop data/models.
# Within this block of code, the various choices for "best" models for the Hot 100 and Pazz & Jop
# are fit, and the model-predicted probabilities are obtained.
#######################################################################################
#######################################################################################
#######################################################################################

##______________________________________________________________________________________________________________
##In-Sample Predictions & Plots

# Hot 100
------------
#4 X 2 array of plots

## Fitting some candidate models:

# Excludes lag-1 responses as predictors in the model:
fitbestH100 <- vglm(formula = cbind(Y1.s,Y2.s,Y3.s,Y5.s,Y7.s,Y9.s,Y10.s,Y486.s)~unemp.s+polit.L1+infl.L1+gdp.s+phys.s+sales_adj.s+sales_adj.L1, family=multinomial)

# Added .01 to 0 values for linear dependence due to 100 counts each year and matrix/linear algebra.
# Because each of the Hot 100 genre counts totaled to 100 for each year, this created a linear
# dependence in the models (non-full rank matrix).  To counteract this, we added .01 to the counts of 0
# which allowed the model matrices to be full rank and allowed the model to be fit, without 
# changing the results in any practical way.

Y1.L1<-ifelse(Y1.L1==0,Y1.L1+.01,Y1.L1)
Y2.L1<-ifelse(Y2.L1==0,Y2.L1+.01,Y2.L1)
Y3.L1<-ifelse(Y3.L1==0,Y3.L1+.01,Y3.L1)
Y486.L1<-ifelse(Y486.L1==0,Y486.L1+.01,Y486.L1)
Y5.L1<-ifelse(Y5.L1==0,Y5.L1+.01,Y5.L1)
Y7.L1<-ifelse(Y7.L1==0,Y7.L1+.01,Y7.L1)
Y9.L1<-ifelse(Y9.L1==0,Y9.L1+.01,Y9.L1)
Y10.L1<-ifelse(Y10.L1==0,Y10.L1+.01,Y10.L1)

# Including lag-1 responses as predictors in the model:

# No time predictor
fitbestLDVsH100 <- vglm(formula = cbind(Y1.s,Y2.s,Y3.s,Y5.s,Y7.s,Y9.s,Y10.s,Y486.s)~unemp.s+polit.L1+infl.L1+gdp.s+phys.s+sales_adj.s+sales_adj.L1+Y1.L1+Y2.L1+Y3.L1+Y5.L1+Y7.L1+Y486.L1+Y9.L1+Y10.L1, family=multinomial)
AIC(fitbestLDVsH100);BIC(fitbestLDVsH100)

# Adding time predictor
fitbestLDVsH1001 <- vglm(formula = cbind(Y1.s,Y2.s,Y3.s,Y5.s,Y7.s,Y9.s,Y10.s,Y486.s)~unemp.s+polit.L1+infl.L1+gdp.s+phys.s+sales_adj.s+sales_adj.L1+time+Y1.L1+Y2.L1+Y3.L1+Y5.L1+Y7.L1+Y486.L1+Y9.L1+Y10.L1, family=multinomial)
AIC(fitbestLDVsH1001);BIC(fitbestLDVsH1001)

# Adding time and time-squared predictors
fitbestLDVsH1002 <- vglm(formula = cbind(Y1.s,Y2.s,Y3.s,Y5.s,Y7.s,Y9.s,Y10.s,Y486.s)~unemp.s+polit.L1+infl.L1+gdp.s+phys.s+sales_adj.s+sales_adj.L1+time+timesq+Y1.L1+Y2.L1+Y3.L1+Y5.L1+Y7.L1+Y486.L1+Y9.L1+Y10.L1, family=multinomial);
AIC(fitbestLDVsH1002);BIC(fitbestLDVsH1002)

# Adding time, time-squared, and time-cubed predictors
fitbestLDVsH1003 <- vglm(formula = cbind(Y1.s,Y2.s,Y3.s,Y5.s,Y7.s,Y9.s,Y10.s,Y486.s)~unemp.s+polit.L1+infl.L1+gdp.s+phys.s+sales_adj.s+sales_adj.L1+time+timesq+timecube+Y1.L1+Y2.L1+Y3.L1+Y5.L1+Y7.L1+Y486.L1+Y9.L1+Y10.L1, family=multinomial);
AIC(fitbestLDVsH1003);BIC(fitbestLDVsH1003)

# Using base R graphics:

# All response variables for each genre from 1975-2018.
alldataH<-cbind(Y1.s,Y2.s,Y3.s,Y5.s,Y7.s,Y9.s,Y10.s,Y486.s)

# Store "best" model (here, fitbestLDVsH100) as generic fit7b object.
fit7b<-fitbestLDVsH100

# mypredmat contains model-predicted genre probabilities for each year.
# myobsmat contains observed genre proportions for each year.
time.vec.s <- seq(-21,22,by=1)
mypredmat<-fitted(fit7b)
myobsmat <- alldataH[,1:8]/rowSums(alldataH[,1:8])




dev.new()
# Using ggplot2
library(ggplot2)

# Since time is years 1975-2018, 1996 is the midpoint year that we center around.
pred.obs.probs <- data.frame(I(rep((time.vec.s+1996),times=8)),I(as.vector(mypredmat)),I(as.vector(myobsmat)),I(rep(dimnames(genres_PJtotal.new)[[2]][-c(4,6)][c(1:5,7,8,6)],each=44)) )
names(pred.obs.probs) <- c("Year","pred.probs","obs.props","genre.type")

# Despite using the dimension names of the Pazz & Jop data here, they match the Hot 100 data because it's the same genres.


P1 <- ggplot(data=pred.obs.probs, aes(x=Year)) + 
  geom_line(aes(y=pred.probs)) + 
  geom_point(aes(y=pred.probs, shape='predicted'))+
  geom_line(aes(y=obs.props)) + 
  geom_point(aes(y=obs.props, shape='observed'))+
 scale_shape(solid=FALSE)

P2 <- P1 + facet_wrap(vars(pred.obs.probs$genre.type),nrow=4,ncol=2, scales="free_y")

P2 + theme(legend.title = element_blank()) + 
theme(legend.position="top") + ggtitle("Observed proportions and model-predicted probabilities for each genre") + ylab("Probabilities")

#P2 + theme(legend.position = "none")


# Pazz & Jop
------------
#4 X 2 array of plots.

## Fitting some candidate models:

alldata2<-cbind(y1.s,y2.s,y3.s,y5.s,y7.s,y9.s,y10.s,y468.s)
# y468.s contains the Rock, Metal, and Punk genre counts (combined into "Rock") for 1975-2018.

# Store "best" model (here, fitbestLDVsB2) as generic fit7b object.
fit7b<-fitbestLDVsB2

# mypredmat contains the model-predicted genre probabilities for each year.
# myobsmat contains the observed genre proportions for each year.
time.vec.s <- seq(-21,22,by=1)
mypredmat<-fitted(fit7b)
myobsmat <- alldata2[,1:9]/rowSums(alldata2[,1:9])



dev.new()
# Using ggplot2
library(ggplot2)

# Since time is years 1975-2018, 1996 is the midpoint year that we center around.
pred.obs.probs <- data.frame(I(rep((time.vec.s+1996),times=9)),I(as.vector(mypredmat)),I(as.vector(myobsmat)),I(rep(dimnames(genres_PJtotal.new)[[2]][-4][c(1:6,8,9,7)],each=44)) )
names(pred.obs.probs) <- c("Year","pred.probs","obs.props","genre.type")

# Despite using the dimension names of the Pazz & Jop data here, they match the Hot 100 data because it's the same genres.


P1 <- ggplot(data=pred.obs.probs, aes(x=Year)) + 
  geom_line(aes(y=pred.probs)) + 
  geom_point(aes(y=pred.probs, shape='predicted'))+
  geom_line(aes(y=obs.props)) + 
  geom_point(aes(y=obs.props, shape='observed'))+
 scale_shape(solid=FALSE)

P2 <- P1 + facet_wrap(vars(pred.obs.probs$genre.type),nrow=5,ncol=2, scales="free_y")

P2 + theme(legend.title = element_blank()) + 
theme(legend.position="top") + ggtitle("Observed proportions and model-predicted probabilities for each genre") + ylab("Probabilities")

#P2 + theme(legend.position = none")


#######################################################################################
#######################################################################################
#######################################################################################
## BLOCK 6 OF CODE
# This block of code shows the results of our forecasts of genre proportions for 2019,
# both for the Hot 100 and the Pazz & Jop.
# The boxplots represent forecasts based on 200 different generated data sets.
# The reason for generating these 200 different data sets is twofold:
# (1) to reflect the uncertainty in the estimated multinomial model coefficients (the beta-hats)
# and (2) to reflect the fact that the unlagged versions of the 2019 predictors are unknown
# at the time of forecasting the 2019 responses, so we forecast possible values of these
# based on ARIMA fits to the corresponding predictor-variable time series.
# We use the jackknife method to estimate the variance-covariance matrix of the
# estimated coefficients of our multinomial logit models.
#######################################################################################
#######################################################################################
#######################################################################################

##____________________________________________________________________________________________________________
### Boxplots of 2019 forecasts, Pazz & Jop model, 8 categories:

# Quadratic Model fit

fitbestLDVsB2.RMP <- vglm(formula = cbind(y1.s,y2.s,y3.s,y5.s,y7.s,y9.s,y10.s,y468.s)~infl.L1+phys.s+time+timesq+y1.L1+y2.L1+y3.L1+y5.L1+y7.L1+y468.L1+y9.L1+y10.L1, family=multinomial)


library(VGAM)

# Whatever the final "best" model we pick is, just name it fitbestLDVs.
fitbestLDVs<-fitbestLDVsB2.RMP

# Saving estimated coefficients:
betahats<-coef(fitbestLDVs)

# estimated covariance matrix using a jackknife:
beta.matrix.store <- matrix(0,nrow=44,ncol=length(betahats))
beta.crossprod.store<-matrix(0,nrow=length(betahats),ncol=length(betahats))

for (k in 1:44){
fitbestLDVsB2.RMPk <- vglm(formula = cbind(y1.s[-k],y2.s[-k],y3.s[-k],y5.s[-k],y7.s[-k],y9.s[-k],y10.s[-k],y468.s[-k])~infl.L1[-k]+phys[-k]+time[-k]+timesq[-k]+y1.L1[-k]+y2.L1[-k]+y3.L1[-k]+y5.L1[-k]+y7.L1[-k]+y468.L1[-k]+y9.L1[-k]+y10.L1[-k], family=multinomial)
beta.matrix.store[k,] <- as.vector(coef(fitbestLDVsB2.RMPk))
}
mean.beta.matrix.store <- colMeans(beta.matrix.store)

for (k in 1:44){
crossprod.comp <- (beta.matrix.store[k,] - mean.beta.matrix.store) %*% t(beta.matrix.store[k,] - mean.beta.matrix.store)
beta.crossprod.store <- beta.crossprod.store + crossprod.comp
}

# The estimated variance-covariance matrix is stored in this object:
vcov_betahats_jackP <- (43/44)*beta.crossprod.store



library(MASS)

# Setting the number of simulated data sets to use (suggest 200?)
nsims<-200

# Simulating 200 sets of betas from a multivariate normal distribution:
betahats.sim<-mvrnorm(n = nsims, mu=betahats, Sigma=vcov_betahats_jackP)

library(forecast)
library(rje)
transf.phys.sim <- rep(0,times=nsims) #placeholder vector

# Fit the best ARIMA model to the transformed "proportion physical" series
#fit.auto.phys <-auto.arima(arcsintr(phys.s),ic='aic',max.p=10,max.q=10,max.d=5)
logit.phys.adj<-ifelse(phys.s==1,log( (phys.s+(1-max(phys.s[phys.s<1])))/ (1-max(phys.s[phys.s<1])) ),logit(phys.s))
fit.auto.phys <-auto.arima(logit.phys.adj,ic='aic',max.p=10,max.q=10,max.d=5)
#Using the ARIMA fit, simulate a future value one year ahead (do this 'nsims' (200?) times)
for (ii in 1:nsims){
transf.phys.sim[ii]<-simulate(fit.auto.phys,future=T,nsim=1,bootstrap=T)
}
# back-transform the simulated values:
#phys.sim<- revarcsintr(transf.phys.sim)
phys.sim<- expit(transf.phys.sim)

#sales_adj.sim <- rep(0,times=nsims) #placeholder vector

# fit the best ARIMA model to the sales_adj series
#fit.auto.sales <-auto.arima(sales_adj.s,ic='aic')

#Using the ARIMA fit, simulate a future value one year ahead (do this 'nsims' (200?) times)
#for (ii in 1:nsims){
#sales_adj.sim[ii]<-simulate(fit.auto.sales,future=T,nsim=1,bootstrap=T)
#}

# Set up placeholder matrices to store values:
fprobs<-matrix(0,nrow=nsims,ncol=8)
numer<-matrix(0,nrow=nsims,ncol=7)

#### This is for the quadratic time trend model that we chose as the best model for the Pazz & Jop data.

# For each of the non-baseline genres, calculate the numerator of the estimated probability expression:
for (j in 1:7){
numer[,j] <- exp(
 betahats.sim[,j] +  
betahats.sim[,j+7]*rep(infl.L1[length(infl.L1)],times=nsims) + 
betahats.sim[,j+14]*phys.sim + 
betahats.sim[,j+21]*rep(23,times=nsims) + 
betahats.sim[,j+28]*rep(23^2,times=nsims)+ 
betahats.sim[,j+35]*rep(y1.s[length(y1.s)],times=nsims) +
betahats.sim[,j+42]*rep(y2.s[length(y2.s)],times=nsims) +
betahats.sim[,j+49]*rep(y3.s[length(y3.s)],times=nsims) +
betahats.sim[,j+56]*rep(y5.s[length(y5.s)],times=nsims) +
betahats.sim[,j+63]*rep(y7.s[length(y7.s)],times=nsims) +
betahats.sim[,j+70]*rep(y468.s[length(y468.s)],times=nsims) +
betahats.sim[,j+77]*rep(y9.s[length(y9.s)],times=nsims) +
betahats.sim[,j+84]*rep(y10.s[length(y10.s)],times=nsims) 
)
}

# Calculate the denominator of the estimated probability expression:
denom<- 1 + rowSums(numer)

# Dividing each numerator by the denominator to get the estimated probabilities for each non-baseline genre.
for (j in 1:7){
fprobs[,j] <- numer[,j]/denom
}
# The probability for the baseline genre (Rock)
fprobs[,8] <- 1 - rowSums(fprobs[,1:7])

# Boxplots of the 200 sets of probability estimates:

dev.new()

###
### Plotting the boxplots of the 2019 Pazz & Jop forecasts
###

library(ggplot2)
dat1 <- stack(as.data.frame(fprobs))
dat1$ind <- rep(abbreviate(c('Country','Folk','Elec','Pop','Rap.HH','RBSoul','Other','Rock')),each=nsims)
p1 <- ggplot(dat1) +
  geom_boxplot(aes(x = ind, y = values))
p1aPJ<- p1 + ggtitle("2019 Simulated Forecasts, Pazz & Jop") +
  xlab("Genre") + ylab("Forecasted Probabilities")
p1aPJ



##Forecasting for Hot 100 (No time trend here), 8 predictors

# Whatever the final "best" model we pick is, just name it fitbestLDVsH.
fitbestLDVsH<-fitbestLDVsH100

# Saving estimated coefficients:
betahats<-coef(fitbestLDVsH)

# estimated covariance matrix using a jackknife:
beta.matrix.store <- matrix(0,nrow=44,ncol=length(betahats))
beta.crossprod.store<-matrix(0,nrow=length(betahats),ncol=length(betahats))

for (k in 1:44){
fitbestLDVsHk <- vglm(formula = cbind(Y1.s[-k], Y2.s[-k], Y3.s[-k], Y5.s[-k], Y7.s[-k], Y9.s[-k], Y10.s[-k], Y486.s[-k]) ~ unemp.s[-k] +     polit.L1[-k] + infl.L1[-k] + gdp.s[-k] + phys[-k] + sales_adj[-k] + sales_adj.L1[-k] +     Y1.L1[-k] + Y2.L1[-k] + Y3.L1[-k] + Y5.L1[-k] + Y7.L1[-k] + Y486.L1[-k] + Y9.L1[-k] +     Y10.L1[-k], family=multinomial)
beta.matrix.store[k,] <- as.vector(coef(fitbestLDVsHk))
}
mean.beta.matrix.store <- colMeans(beta.matrix.store)

for (k in 1:44){
crossprod.comp <- (beta.matrix.store[k,] - mean.beta.matrix.store) %*% t(beta.matrix.store[k,] - mean.beta.matrix.store)
beta.crossprod.store <- beta.crossprod.store + crossprod.comp
}

# The estimated variance-covariance matrix is stored in this object:
vcov_betahats_jackH <- (43/44)*beta.crossprod.store



library(MASS)

# Setting the number of simulated data sets to use (suggest 200?)
nsims<-200

# Simulating 200 sets of betas from a multivariate normal distribution:
betahats.sim<-mvrnorm(n = nsims, mu=betahats, Sigma=vcov_betahats_jackH)


library(forecast)
library(rje)
transf.phys.sim <- rep(0,times=nsims) #placeholder vector

# fit the best ARIMA model to the transformed "proportion physical" series
#fit.auto.phys <-auto.arima(arcsintr(phys.s),ic='aic',max.p=10,max.q=10,max.d=5)
logit.phys.adj<-ifelse(phys.s==1,log( (phys.s+(1-max(phys.s[phys.s<1])))/ (1-max(phys.s[phys.s<1])) ),logit(phys.s))
fit.auto.phys <-auto.arima(logit.phys.adj,ic='aic',max.p=10,max.q=10,max.d=5)
#Using the ARIMA fit, simulate a future value one year ahead (do this 'nsims' (200?) times)
for (ii in 1:nsims){
transf.phys.sim[ii]<-simulate(fit.auto.phys,future=T,nsim=1,bootstrap=T)
}
# back-transform the simulated values:
#phys.sim<- revarcsintr(transf.phys.sim)
phys.sim<- expit(transf.phys.sim)

sales_adj.sim <- rep(0,times=nsims) #placeholder vector

# fit the best ARIMA model to the sales_adj series
fit.auto.sales <-auto.arima(sales_adj.s,ic='aic')

#Using the ARIMA fit, simulate a future value one year ahead (do this 'nsims' (200?) times)
for (ii in 1:nsims){
sales_adj.sim[ii]<-simulate(fit.auto.sales,future=T,nsim=1,bootstrap=T)
}


#GDP simulation
gdp.sim <- rep(0,times=nsims) #placeholder vector

# fit the best ARIMA model to the gdp series
fit.auto.gdp <-auto.arima(gdp.s,ic='aic')

#Using the ARIMA fit, simulate a future value one year ahead (do this 'nsims' (200?) times)
for (ii in 1:nsims){
gdp.sim[ii]<-simulate(fit.auto.gdp,future=T,nsim=1,bootstrap=T)
}

#Unemployment simulation
unemp.sim <- rep(0,times=nsims) #placeholder vector

# fit the best ARIMA model to the unemp series
fit.auto.unemp <-auto.arima(unemp.s,ic='aic')

#Using the ARIMA fit, simulate a future value one year ahead (do this 'nsims' (200?) times)
for (ii in 1:nsims){
unemp.sim[ii]<-simulate(fit.auto.unemp,future=T,nsim=1,bootstrap=T)
}


# Set up placeholder matrices to store values:
fprobs<-matrix(0,nrow=nsims,ncol=8)
numer<-matrix(0,nrow=nsims,ncol=7)



# We leave this commented code as an example of the Hot 100 model fit with a cubic time trend,
# for which the output is discussed in the paper and the forecasts provided in the supplementary material.

#### This is for the cubic time trend fit:
#### Run this only if the best model is chosen to be
#### fitbestLDVsH<-fitbestLDVsH100
#### at the beginning of this section of code.

# For each of the non-baseline genres, calculate the numerator of the estimated probability expression:
#for (j in 1:7){
#numer[,j] <- exp(
# betahats.sim[,j] +  
#betahats.sim[,j+7]*unemp.sim +
#betahats.sim[,j+14]*rep(polit.s[length(polit.s)],times=nsims) + 
#betahats.sim[,j+21]*rep(infl.s[length(infl.s)],times=nsims) + 
#betahats.sim[,j+28]*gdp.sim + 
#betahats.sim[,j+35]*phys.sim + 
#betahats.sim[,j+42]*sales_adj.sim + 
#betahats.sim[,j+49]*rep(sales_adj[length(sales_adj)],times=nsims) +
#betahats.sim[,j+56]*rep(23,times=nsims) + 
#betahats.sim[,j+63]*rep(23^2,times=nsims)+ 
#betahats.sim[,j+70]*rep(23^3,times=nsims)+ 
#betahats.sim[,j+77]*rep(Y1.s[length(Y1.s)],times=nsims) +
#betahats.sim[,j+84]*rep(Y2.s[length(Y2.s)],times=nsims) +
#betahats.sim[,j+91]*rep(Y3.s[length(Y3.s)],times=nsims) +
#betahats.sim[,j+98]*rep(Y5.s[length(Y5.s)],times=nsims) +
#betahats.sim[,j+105]*rep(Y7.s[length(Y7.s)],times=nsims) +
#betahats.sim[,j+112]*rep(Y486.s[length(Y486.s)],times=nsims) +
#betahats.sim[,j+119]*rep(Y9.s[length(Y9.s)],times=nsims) +
#betahats.sim[,j+126]*rep(Y10.s[length(Y10.s)],times=nsims) 
#)
#}

# This is for the no-time fit:
# Note there are fewer estimated coefficients in the model without the time terms.



# # --- One of the sales.adj components is simulated: this is the "unknown" (but forecasted) 2019 adjusted sales value.
# For each of the non-baseline genres, calculate the numerator of the estimated probability expression:
for (j in 1:7){
numer[,j] <- exp(
 betahats.sim[,j] +  
betahats.sim[,j+7]*unemp.sim +
betahats.sim[,j+14]*rep(polit.s[length(polit.s)],times=nsims) + 
betahats.sim[,j+21]*rep(infl.s[length(infl.s)],times=nsims) + 
betahats.sim[,j+28]*gdp.sim + 
betahats.sim[,j+35]*phys.sim + 
betahats.sim[,j+42]*sales_adj.sim + 
betahats.sim[,j+49]*rep(sales_adj.s[length(sales_adj.s)],times=nsims) +
betahats.sim[,j+56]*rep(Y1.s[length(Y1.s)],times=nsims) +
betahats.sim[,j+63]*rep(Y2.s[length(Y2.s)],times=nsims) +
betahats.sim[,j+70]*rep(Y3.s[length(Y3.s)],times=nsims) +
betahats.sim[,j+77]*rep(Y5.s[length(Y5.s)],times=nsims) +
betahats.sim[,j+84]*rep(Y7.s[length(Y7.s)],times=nsims) +
betahats.sim[,j+91]*rep(Y486.s[length(Y486.s)],times=nsims) +
betahats.sim[,j+98]*rep(Y9.s[length(Y9.s)],times=nsims) +
betahats.sim[,j+105]*rep(Y10.s[length(Y10.s)],times=nsims) 
)
}

# calculate the denominator of the estimated probability expression:
denom<- 1 + rowSums(numer)

# dividing each numerator by the denominator to get 
# the estimated probabilities for each non-baseline genre
for (j in 1:7){
fprobs[,j] <- numer[,j]/denom
}
# the probability for the baseline genre (Rock)
fprobs[,8] <- 1 - rowSums(fprobs[,1:7])

####
# boxplots of the 200 sets of probability estimates:
####

### the below code is OK!
#  (these data are the real 2019 Hot 100 genre proportions for the 8 genres):
true2019props<-data.frame(genres=abbreviate(c('Country','Folk','Elec','Pop','Rap.HH','RBSoul','Other','Rock')),
trueprops=c(.15,.00,.03,.31,.40,.05,.03,.03) )
## Put true proportions in this order:
#"Cntr" "Folk" "Elec" "Pop"  "R.H." "RBSl" "Othr" "Rock"

###
### Plotting the boxplots of the 2019 Hot 100 forecasts, 
### along with the X marking the real 2019 Hot 100 genre proportions
###

library(ggplot2)
dat1 <- stack(as.data.frame(fprobs))
dat1$ind <- rep(abbreviate(c('Country','Folk','Elec','Pop','Rap.HH','RBSoul','Other','Rock')),each=nsims)
p1 <- ggplot(dat1) +
  geom_boxplot(aes(x = ind, y = values)) +
  geom_point(
  data=true2019props,
  aes(x=genres,y=trueprops),
  size=3, shape=4
  )
p1aH100_2<- p1 + ggtitle("2019 Simulated Forecasts, Hot 100") +
  xlab("Genre") + ylab("Forecasted Probabilities")
p1aH100_2


###
### The code below is just for plotting the results on one plot.
###

########  
# Multiplot function for later: just run this once:

multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

# Creating pdf of both sets of boxplots on one plot...
#multiplot(p1a, p4a, p2a, p5a,p3a,p6a, cols=3)
#pdf("forecasts_2019_both_on_same_plot.pdf")
multiplot(p1aH100_2, p1aPJ, cols=1) #better, With H100 2019 forecasts and P&J 2019 forecasts on the same Figure?
#dev.off()

#######################################################################################
#######################################################################################
#######################################################################################
## BLOCK 7 OF CODE:
# This block of code is for the time series analysis of the Cramer's V measures of association
# described in the paper.
# It calculates Cramer's V separately at each year, for the 2 by 8 table of genre counts by 
# ranking type (Hot 100 or Pazz & Jop).
# Then we fit ARIMA models to the series of Cramer's V values, and pick the best ARIMA model.
# We produce model diagnostics like ACF and PACF plots for the fitted ARIMA model.
# This code produces forecasts of the Cramer's V for future years (2019-2023)
# and plots these forecasts along with the Cramer's V series from 1974-2018.
# The uncertainty in the forecasts is shown by the simulated future (2019-2023) trajectories of the Cramer's V series.
#######################################################################################
#######################################################################################
#######################################################################################


##_________________________________________________________________________________________________________________
##Cramer's V

#UPDATE
# Pazz & Jop genre counts
alldata2<-cbind(y1.s,y2.s,y3.s,y5.s,y7.s,y9.s,y10.s,y468.s)
alldata2.L<-cbind(y1.L1,y2.L1,y3.L1,y5.L1,y7.L1,y9.L1,y10.L1,y468.L1)
#complete P&J genre counts
alldata_comb<-rbind(alldata2.L[1,],alldata2)

# Hot 100 genre counts
alldataH<-cbind(Y1.s,Y2.s,Y3.s,Y5.s,Y7.s,Y9.s,Y10.s,Y486.s)
alldataH.L<-c(10,11,1,11,0,34,3,30) #lagged values without 0.01 adjustment from modeling; 1974 counts
#complete H100 genre counts
alldata_combH<-rbind(alldataH.L,alldataH)

# Cramer's V Function
# Copy and paste this function first:

CramerV <- function(xtable){
  xtable<-xtable[,colSums(xtable)!=0]
  V<-sqrt(chisq.test(xtable, correct = FALSE)$statistic / (sum(xtable) * min(dim(xtable) - 1 )))
  names(V) <- NULL
  return(V)
}

#We had to add another line before to remove the sums with 0 values for both PJ and Hot100 
#This is the line: xtable<-xtable[,colSums(xtable)!=0]
# in the above code
#the zero value was creating a divide by zero error even after collapsing some of the genres

# Need both Pazz & Jop and Hot 100 data in table format for "xtable" variable.
#Pazz & Jop and Hot 100 combined in one matrix

CramerVvec<-rep(0,nrow(alldata_combH))
for(i in 1:nrow(alldata_combH))
{
  mymat=rbind(alldata_comb[i,],alldata_combH[i,])
  CramerVvec[i]<-CramerV(mymat)
}


library(astsa)
library(TSA)

# Cramer's V vector as a time series object from 1974-2018.
ts.Cramers<-ts(CramerVvec,start=1974,end=2018)
plot(ts.Cramers)

# Model diagnostics
acf(ts.Cramers)
pacf(ts.Cramers)


library(forecast)
# this package may interfere w plot() so can uninstall between uses.

# Fit a ARIMA model to Cramer's V time series.
cramers.fit<-auto.arima(ts.Cramers)

# Compute logit transformation on the 0 to 1 Cramer's data.
# Makes the values range from -inf to inf.
logit.ts.Cramers<-log(ts.Cramers/(1-ts.Cramers))

# Fit a ARIMA model to logit-transformed Cramer's V time series.
cramers.log.fit<-auto.arima(logit.ts.Cramers)

# Model diagnostics for logit-transformed Cramer's V time series' model; 
# Includes residual plot, autocorrelations plot, and quantile-quantile plot
plot(resid(cramers.log.fit),type="o")
acf(resid(cramers.log.fit))
qqnorm(resid(cramers.log.fit))

# Model diagnotics for Cramer's V time series' model; 
# Includes residual plot, autocorrelations plot, and quantile-quantile plot
plot(resid(cramers.fit),type="o")
acf(resid(cramers.fit))
qqnorm(resid(cramers.fit))

#Shapiro Wilks test for Normality (as opposed to quantile-quantile evaluation)
#H0 is that the data is Normal... so a small p-value rejecting H0 thus rejects Normality
shapiro.test(resid(cramers.fit)) #will see not normal resids!

shapiro.test(resid(cramers.log.fit))
#use logit transformed data and back transform after prediction/forecasting analyses

# Inverse logit transformation
# exp(x)/(1+exp(x))

# Changed the order from (0,0,1) to (0,1,1) to fit the nonstationary ARIMA model we chose:
# Use auto.arima() rather than arima() to fit the ARIMA(0,1,1) model.
# auto.arima() will pick the ARIMA(0,1,1) choice (verify by printing the result)
# but unlike arima(), it will include a drift (constant) term in the model,
# which will make the forecasts more appropriate with this nonstationary series.

library(forecast);library(TSA)

# This is the number of simulated future trajectories.  
# Let's see how using 20 gray trajectories looks on the plot:
number.paths <- 20

# number of years into the future we are forecasting
# This would be forecasting Cramer's V for the years 2019 to 2023.
n.future.years <- 5

future.path.mat <- matrix(0,nrow=number.paths, ncol=n.future.years)
for (i in 1:number.paths){
future.path <- simulate(cramers.log.fit,future=T,bootstrap=T,nsim=n.future.years)
future.path.mat[i,] <- as.vector(future.path)
}
future.path.mat.back <- exp(future.path.mat)/(1+exp(future.path.mat))

# Need to use sarima.for() rather than arima() to forecast with NONSTATIONARY series 
# when there is a constant (drift) term in the model
# as there is here.

library(astsa)
cramer.pred<-sarima.for(logit.ts.Cramers,n.future.years,0,1,1)$pred
cramer.se<-sarima.for(logit.ts.Cramers,n.future.years,0,1,1)$se

# Get rid of this because the 1.96 is assuming normal errors.
# We don't really need the prediction intervals if we show the simulated trajectories.
#cramer.LCL<-cramer.pred - 1.96*cramer.se
#cramer.UCL<-cramer.pred + 1.96*cramer.se

back<-exp(cramer.pred)/(1+exp(cramer.pred))

#print(back)

# Changed the years below to reflect the fact that our observed time series is now from 1974 to 2018 (45 years in all)
# and the years we are forecasting for are 2019 to 2023:
total_predict<-c(ts.Cramers,back)

dev.new()
plot(2018:2023,c(total_predict[45],future.path.mat.back[1,]), xlab='Year', ylab='Cramer\'s V', type='l', col='gray', xlim=c(1974,2023), ylim=c(0.2,0.7))
## MIGHT NEED TO CHANGE THE ylim ABOVE!  Need to see the plot to be sure.
for (i in 2:number.paths){
lines(2018:2023,c(total_predict[45],future.path.mat.back[i,]),type='l',col='gray')
}
lines(2018:2023,total_predict[45:50],type="o",lty=5, lwd=1.5) # can play with 'lwd' to make the line thicker?
lines(1974:2018,total_predict[1:45],type="o",lty=1)

#######################################################################################
#######################################################################################
#######################################################################################
## BLOCK 8 OF CODE
# This last block of code is for the Cross-correlation function discussed in the Cramer's V section.
# Prewhitening based on ARIMA models are used.
# This is to explore the possible association (over the latter part of the time period)
# between the Cramer's V association measure and the proportion of physical (vs. streaming) sales.
#######################################################################################
#######################################################################################
#######################################################################################

##_______________________________________________________________________________________________________________
##Cross-Correlation Function

phys.ts<-ts(c(1,as.vector(phys)),start=1974)
logit.phys.ts <- ts(c(4.209684751,as.vector(logit.phys.adj)),start=1974)

cramers.v<-c(0.3104528,0.2868408,0.3835920,0.3049482,0.3466255,
0.2999057,0.2629912,0.3006844,0.3461504,0.3115307,
0.3129164,0.2657280,0.4454695,0.3998983,0.5819051,
0.4611803,0.3654158,0.4160336,0.4480346,0.3965001,
0.4491523,0.4522918,0.4472814,0.5509137,0.4889326,
0.3817765,0.4763275,0.4355243,0.5632115,0.4653838,
0.5188691,0.4736694,0.4146267,0.4850674,0.5253197,
0.4473457,0.4766530,0.5643806,0.5294966,0.4603728,
0.4473651,0.4698404,0.5742811,0.4438992,0.4260084)

logit.cramers.v <- logit(cramers.v)

logit.cramers.v.ts <- ts(logit.cramers.v, start=1974)

# prewhitening with ARIMA(0,1,1) model on logit Cramers V:
m1=arima(logit.cramers.v.ts,order=c(0,1,1))
#prewhiten(x=logit.cramers.v.ts,y=logit.phys.ts,x.model=m1)


library(forecast)
x <- logit.cramers.v.ts - fitted(Arima(logit.cramers.v.ts, model = m1))
y <- logit.phys.ts - fitted(Arima(logit.phys.ts, model = m1))
ccf(x, y)

# prewhitening with ARIMA(0,2,2) model on logit phys:
m2=arima(logit.phys.ts,order=c(0,2,2))
#prewhiten(x=logit.phys.ts,y=logit.cramers.v.ts,x.model=m2)

y <- logit.cramers.v.ts - fitted(Arima(logit.cramers.v.ts, model = m2))
x <- logit.phys.ts - fitted(Arima(logit.phys.ts, model = m2))
ccf(x, y)

# Just focusing on the cross-correlation once streaming became popular, since 1990...
cutoffyr<-16
logit.phys.ts.s<-ts(logit.phys.ts[-(1:cutoffyr)],end=2018)
logit.cramers.v.ts.s <- ts(logit.cramers.v.ts[-(1:cutoffyr)], end=2018)

library(forecast)
#m1=auto.arima(logit.cramers.v.ts.s)
# Treat logit.phys.ts.s as the 'x' series:
m2=auto.arima(logit.phys.ts.s)

y <- logit.cramers.v.ts.s - fitted(Arima(logit.cramers.v.ts.s, model = m2))
x <- logit.phys.ts.s - fitted(Arima(logit.phys.ts.s, model = m2))
ccf(x, y, ylab='Cross-correlation',main='')
title("CCF between logit-transformed and \n prewhitened proportion physical and Cramer's V")
